library(data.table)
library(magrittr)
library(ggplot2)
library(dplyr)
library(stringr)
library(ggplot2)
library(gridExtra)
library(ggExtra)
library(GGally)
library(caret)
library(glmnet)
library(geosphere)
library(InformationValue)
library(ggcorrplot)
# Load the Kaggle March Machine Learning Mania data files.
# Paths are relative to the working directory; run from the project root.
teams <- read.csv("Data/Teams.csv")
seasons <- read.csv("Data/Seasons.csv")
seeds <- read.csv("Data/NCAATourneySeeds.csv")
conferences <- read.csv("Data/Conferences.csv")
coaches <- read.csv("Data/TeamCoaches.csv")
team_conferences <- read.csv("Data/TeamConferences.csv")
tour_compact_res <- read.csv("Data/NCAATourneyCompactResults.csv")
tour_detail_res <- read.csv("Data/NCAATourneyDetailedResults.csv")
seas_compact_res <- read.csv("Data/RegularSeasonCompactResults.csv")
seas_detail_res <- read.csv("Data/RegularSeasonDetailedResults.csv")
# Pre-built merged file containing venue / home-stadium distance columns
# (presumably produced by an earlier script -- TODO confirm provenance)
new_master_file <- read.csv("Data/df11.csv")
View the first few rows of each table to understand the variables.
# Preview teams: TeamID, name, first/last Division-I season
head(teams,n=5)
## TeamID TeamName FirstD1Season LastD1Season
## 1 1101 Abilene Chr 2014 2018
## 2 1102 Air Force 1985 2018
## 3 1103 Akron 1985 2018
## 4 1104 Alabama 1985 2018
## 5 1105 Alabama A&M 2000 2018
# Preview seasons: season year, day-zero date, region names
head(seasons,n=5)
## Season DayZero RegionW RegionX RegionY RegionZ
## 1 1985 10/29/1984 East West Midwest Southeast
## 2 1986 10/28/1985 East Midwest Southeast West
## 3 1987 10/27/1986 East Southeast Midwest West
## 4 1988 11/2/1987 East Midwest Southeast West
## 5 1989 10/31/1988 East West Midwest Southeast
# Preview seeds: region letter + seed number (e.g. "W01") per team/season
head(seeds,n=5)
## Season Seed TeamID
## 1 1985 W01 1207
## 2 1985 W02 1210
## 3 1985 W03 1228
## 4 1985 W04 1260
## 5 1985 W05 1374
# Preview conferences: abbreviation and full description
head(conferences,n=5)
## ConfAbbrev Description
## 1 a_sun Atlantic Sun Conference
## 2 a_ten Atlantic 10 Conference
## 3 aac American Athletic Conference
## 4 acc Atlantic Coast Conference
## 5 aec America East Conference
# Preview coaches: coaching tenure (day range) per team and season
head(coaches,n=5)
## Season TeamID FirstDayNum LastDayNum CoachName
## 1 1985 1102 0 154 reggie_minton
## 2 1985 1103 0 154 bob_huggins
## 3 1985 1104 0 154 wimp_sanderson
## 4 1985 1106 0 154 james_oliver
## 5 1985 1108 0 154 davey_whitney
# Preview team-conference membership per season
head(team_conferences,n=5)
## Season TeamID ConfAbbrev
## 1 1985 1114 a_sun
## 2 1985 1147 a_sun
## 3 1985 1204 a_sun
## 4 1985 1209 a_sun
## 5 1985 1215 a_sun
# Preview tournament compact results: winner/loser IDs and scores
head(tour_compact_res,n=5)
## Season DayNum WTeamID WScore LTeamID LScore WLoc NumOT
## 1 1985 136 1116 63 1234 54 N 0
## 2 1985 136 1120 59 1345 58 N 0
## 3 1985 136 1207 68 1250 43 N 0
## 4 1985 136 1229 58 1425 55 N 0
## 5 1985 136 1242 49 1325 38 N 0
# Preview tournament detailed results: per-game box-score stats,
# W*-prefixed for the winner and L*-prefixed for the loser
head(tour_detail_res,n=5)
## Season DayNum WTeamID WScore LTeamID LScore WLoc NumOT WFGM WFGA WFGM3
## 1 2003 134 1421 92 1411 84 N 1 32 69 11
## 2 2003 136 1112 80 1436 51 N 0 31 66 7
## 3 2003 136 1113 84 1272 71 N 0 31 59 6
## 4 2003 136 1141 79 1166 73 N 0 29 53 3
## 5 2003 136 1143 76 1301 74 N 1 27 64 7
## WFGA3 WFTM WFTA WOR WDR WAst WTO WStl WBlk WPF LFGM LFGA LFGM3 LFGA3
## 1 29 17 26 14 30 17 12 5 3 22 29 67 12 31
## 2 23 11 14 11 36 22 16 10 7 8 20 64 4 16
## 3 14 16 22 10 27 18 9 7 4 19 25 69 7 28
## 4 7 18 25 11 20 15 18 13 1 19 27 60 7 17
## 5 20 15 23 18 20 17 13 8 2 14 25 56 9 21
## LFTM LFTA LOR LDR LAst LTO LStl LBlk LPF
## 1 14 31 17 28 16 15 5 0 22
## 2 7 7 8 26 12 17 10 3 15
## 3 14 21 20 22 11 12 2 5 18
## 4 12 17 14 17 20 21 6 6 21
## 5 15 20 10 26 16 14 5 8 19
# Preview regular-season compact results
head(seas_compact_res,n=5)
## Season DayNum WTeamID WScore LTeamID LScore WLoc NumOT
## 1 1985 20 1228 81 1328 64 N 0
## 2 1985 25 1106 77 1354 70 H 0
## 3 1985 25 1112 63 1223 56 H 0
## 4 1985 25 1165 70 1432 54 H 0
## 5 1985 25 1192 86 1447 74 H 0
# Preview regular-season detailed results (same W*/L* box-score layout)
head(seas_detail_res,n=5)
## Season DayNum WTeamID WScore LTeamID LScore WLoc NumOT WFGM WFGA WFGM3
## 1 2003 10 1104 68 1328 62 N 0 27 58 3
## 2 2003 10 1272 70 1393 63 N 0 26 62 8
## 3 2003 11 1266 73 1437 61 N 0 24 58 8
## 4 2003 11 1296 56 1457 50 N 0 18 38 3
## 5 2003 11 1400 77 1208 71 N 0 30 61 6
## WFGA3 WFTM WFTA WOR WDR WAst WTO WStl WBlk WPF LFGM LFGA LFGM3 LFGA3
## 1 14 11 18 14 24 13 23 7 1 22 22 53 2 10
## 2 20 10 19 15 28 16 13 4 4 18 24 67 6 24
## 3 18 17 29 17 26 15 10 5 2 25 22 73 3 26
## 4 9 17 31 6 19 11 12 14 2 18 18 49 6 22
## 5 14 11 13 17 22 12 14 4 4 20 24 62 6 16
## LFTM LFTA LOR LDR LAst LTO LStl LBlk LPF
## 1 16 22 10 22 8 18 9 2 20
## 2 9 20 20 25 7 12 8 6 16
## 3 14 23 31 22 9 12 2 5 23
## 4 8 15 17 20 9 19 4 3 23
## 5 17 27 21 15 12 10 7 1 14
# Preview the pre-merged master file: game stats plus venue/home-arena
# location columns and a Distance (venue to home stadium) column
head(new_master_file,n=5)
## X.1 X Season DayNum WScore LScore WLoc NumOT WFGM WFGA WFGM3 WFGA3 WFTM
## 1 1 1 2003 134 92 84 N 1 32 69 11 29 17
## 2 2 2 2003 136 80 51 N 0 31 66 7 23 11
## 3 3 3 2003 136 84 71 N 0 31 59 6 14 16
## 4 4 4 2003 136 79 73 N 0 29 53 3 7 18
## 5 5 5 2003 136 76 74 N 1 27 64 7 20 15
## WFTA WOR WDR WAst WTO WStl WBlk WPF LFGM LFGA LFGM3 LFGA3 LFTM LFTA LOR
## 1 26 14 30 17 12 5 3 22 29 67 12 31 14 31 17
## 2 14 11 36 22 16 10 7 8 20 64 4 16 7 7 8
## 3 22 10 27 18 9 7 4 19 25 69 7 28 14 21 20
## 4 25 11 20 15 18 13 1 19 27 60 7 17 12 17 14
## 5 23 18 20 17 13 8 2 14 25 56 9 21 15 20 10
## LDR LAst LTO LStl LBlk LPF TeamID.Type TeamID Region Play_In Seeding
## 1 28 16 15 5 0 22 WTeamID 1421 X 1 16
## 2 26 12 17 10 3 15 WTeamID 1112 Z 0 1
## 3 22 11 12 2 5 18 WTeamID 1113 Z 0 10
## 4 17 20 21 6 6 21 WTeamID 1141 Z 0 11
## 5 26 16 14 5 8 19 WTeamID 1143 W 0 8
## CRType CityID City State ConfAbbrev Home_Address
## 1 <NA> NA <NA> <NA> big_south Asheville, NC, Kimmel Arena
## 2 <NA> NA <NA> <NA> pac_ten Tucson, AZ, McKale Center
## 3 <NA> NA <NA> <NA> pac_ten Tempe, AZ, Wells Fargo Arena
## 4 <NA> NA <NA> <NA> mac Mount Pleasant, MI, McGuirk Arena
## 5 <NA> NA <NA> <NA> pac_ten Berkeley, CA, Haas Pavilion
## Home_Arena Home_City Home_State..Abrv Home_State
## 1 Kimmel Arena Asheville NC North Carolina
## 2 McKale Center Tucson AZ Arizona
## 3 Wells Fargo Arena Tempe AZ Arizona
## 4 McGuirk Arena Mount Pleasant MI Michigan
## 5 Haas Pavilion Berkeley CA California
## Home_Team Home_Conference Home_Capacity Home_Opened Home_Lat
## 1 UNC Asheville Big South 3,200 2011 -82.56731
## 2 Arizona Pac-12 14,545 1973 -110.94607
## 3 Arizona State Pac-12 10,754 1974 -111.93101
## 4 Central Michigan MAC 5,300 1973 -84.77406
## 5 California Pac-12 11,877 1933 -122.26231
## Home_Lon Home_Team_Match Tourney_Address Tourney_Lat Tourney_Lon
## 1 35.61629 unc asheville <NA> -92.25413 38.12314
## 2 32.23030 arizona <NA> -92.25413 38.12314
## 3 33.42450 arizona state <NA> -92.25413 38.12314
## 4 43.58149 central michigan <NA> -92.25413 38.12314
## 5 37.86939 california <NA> -92.25413 38.12314
## Tourney_Round Tourney_State Tourney_Stadium Distance
## 1 <NA> <NA> <NA> 906151.9
## 2 <NA> <NA> <NA> 1818807.5
## 3 <NA> <NA> <NA> 1848419.1
## 4 <NA> <NA> <NA> 874461.0
## 5 <NA> <NA> <NA> 2621042.7
We want to train a model to predict tournament wins and losses. We randomize the winner and loser of each game into team 1 and team 2 (necessary for computing the probability that team 1 wins later) and drop the original winner/loser IDs.
# Randomly assign each game's winner/loser to "team 1" / "team 2" so the
# outcome label is not trivially 1 on every row. n() gives the row count of
# the piped data, avoiding the fragile reference back to the global
# tour_compact_res via dim(...)[1] (same number of draws, same behaviour).
rand_tourn_res <- tour_compact_res %>%
  select(Season, DayNum, WTeamID, LTeamID) %>%
  mutate(rand = runif(n()),
         team1id = ifelse(rand >= 0.5, WTeamID, LTeamID),
         team2id = ifelse(rand < 0.5, WTeamID, LTeamID),
         team1win = ifelse(team1id == WTeamID, 1, 0)) %>%
  select(-rand, -WTeamID, -LTeamID)
# Alternative starting from the master file (keeps Distance); kept for reference:
# rand_tourn_res <- new_master_file %>%
#   select(Season, DayNum, WTeamID, LTeamID, Distance) %>%
#   mutate(rand = runif(n()),
#          team1id = ifelse(rand >= 0.5, WTeamID, LTeamID),
#          team2id = ifelse(rand < 0.5, WTeamID, LTeamID),
#          team1win = ifelse(team1id == WTeamID, 1, 0)) %>%
#   select(-rand, -WTeamID, -LTeamID)
Then we add seeding information to games as seeding can be an important predictor
# Extract a numeric ranking from the seed strings (e.g. "W01" -> 1,
# "X11a" -> 11). First strip the region letter, then strip any trailing
# play-in letter before converting to numeric.
# BUG FIX: the original second pattern was ".[a-z]" -- the unescaped "."
# also consumed the digit before the play-in suffix, so seed "11a" became
# rank 1 instead of 11. "[a-z]" removes only the letter itself.
seeds_tournment <- seeds %>% mutate(ranking = as.factor((str_replace(Seed, "[A-Z]",""))),
rank_num = as.numeric(str_replace(ranking, "[a-z]","")))
# Attach seed information (factor ranking + numeric rank) for team 1,
# matching on team id and season.
rand_tourn_res <- rand_tourn_res %>%
  left_join(seeds_tournment %>%
              select(TeamID, Season, t1_rank = ranking, t1_rank_n = rank_num),
            by = c("team1id" = "TeamID", "Season" = "Season"))
# Same join a second time for team 2.
rand_tourn_res <- rand_tourn_res %>%
  left_join(seeds_tournment %>%
              select(TeamID, Season, t2_rank = ranking, t2_rank_n = rank_num),
            by = c("team2id" = "TeamID", "Season" = "Season"))
# Some teams have NA seeds (no tournament seed that season); replace the
# missing values with 8, the midpoint of the 1-16 seed range, and compute
# the seed difference between the two teams.
# NOTE(review): t1_rank / t2_rank are factors at this point, so ifelse()
# returns the underlying integer level codes rather than the seed labels --
# the filled t*_rank columns hold level indices, not true seed numbers.
# The numeric t*_rank_n columns are the reliable ones; confirm before
# relying on t*_rank downstream.
rand_tourn_res <- rand_tourn_res %>% mutate(t1_rank = ifelse(is.na(t1_rank), 8, t1_rank),
t2_rank = ifelse(is.na(t2_rank), 8, t2_rank),
t1_rank_n = ifelse(is.na(t1_rank_n), 8, t1_rank_n),
t2_rank_n = ifelse(is.na(t2_rank_n), 8, t2_rank_n),
diff_rank = t1_rank_n - t2_rank_n)
Besides seeding information, we are also interested in how certain regular-season statistics correlate with winning versus losing. We take the regular-season detailed results and stack them vertically, with a single column of TeamIDs and a factor indicating whether each row corresponds to a win or a loss.
# Build one long table with a single set of stat columns and a Res flag:
# 1 for the winning side of each regular-season game, 0 for the losing side.

# Winning teams' stats: Season plus every W-prefixed column
win_predictors <- seas_detail_res %>% select(Season,starts_with("W"))
# Res records the game result: 1 = this row is the winning team
win_predictors$Res = 1
# Strip only the leading "W" team prefix (WScore -> Score, WTeamID -> TeamID).
# Anchoring with "^W" leaves Season and Res untouched -- the original
# substring(names, 2) chopped the first character of EVERY name ("Season"
# -> "eason", "Res" -> "es") and needed a compensating rename afterwards.
names(win_predictors) = sub("^W", "", names(win_predictors))
# Drop Loc: it exists only for the winner, and both halves must have
# identical columns before stacking
win_predictors <- win_predictors%>% select(-Loc)

# Losing teams' stats, built the same way with Res = 0
lose_predictors <- seas_detail_res %>% select(Season,starts_with("L"))
lose_predictors$Res = 0
names(lose_predictors) = sub("^L", "", names(lose_predictors))

# Stack the winning and losing halves row-wise
predictors_all <- rbindlist(list(win_predictors, lose_predictors))
# Derive shooting-efficiency rates: overall field-goal %, two-point %,
# three-point %, and free-throw %.
predictors_all <- predictors_all %>%
  mutate(FGP  = FGM / FGA,
         FGP2 = (FGM - FGM3) / (FGA - FGA3),
         FGP3 = FGM3 / FGA3,
         FTP  = FTM / FTA)
# Encode the result twice: Res as a two-level factor ('W' win / 'L' loss)
# for plotting, and Outcome as the same information in 1/0 numeric form.
predictors_all <- predictors_all %>%
  mutate(Res = factor(ifelse(Res == 1, 'W', 'L')),
         Outcome = ifelse(Res == 'W', 1, 0))
Create mean stats for each season, each team using group by and summarise_all function
# Season-level averages of every stat for each team.
# dplyr::funs() is defunct (removed in dplyr >= 1.1); across() is the
# modern replacement. mean() on the factor column Res yields NA with a
# warning, matching the original summarise_all(funs(mean)) behaviour.
mean_predictors <- predictors_all %>% group_by(Season,TeamID) %>% summarise(across(everything(), mean))
Join the mean stats with the tournament results dataframe. The constructed model_df is our main dataframe for model building. Note that we have not yet added the distance from each game venue to each team's home stadium (TODO).
# Attach each team's season-average stats to the tournament games.
# Team 1 stats
model_df <- rand_tourn_res %>% inner_join(
mean_predictors,
by = c("team1id"="TeamID","Season"="Season"))
# Team 2 stats; duplicated stat columns pick up dplyr's .x/.y suffixes here
model_df <- model_df %>% inner_join(
mean_predictors,
by = c("team2id"="TeamID","Season"="Season"))
# Replace the .x/.y join suffixes with _t1/_t2 (team 1 / team 2 stats).
# The patterns are anchored ('\\.x$') so only a literal ".x"/".y" suffix is
# rewritten -- the original unanchored '.x' / '.y' matched ANY character
# before x/y, which is how "DayNum" was silently renamed to "D_t2Num".
names(model_df) <- gsub('\\.x$', '_t1', names(model_df))
names(model_df) <- gsub('\\.y$', '_t2', names(model_df))
# Downstream code (the Distance merges and the model formula) was written
# against the mangled name, so keep exposing DayNum as "D_t2Num".
names(model_df)[names(model_df) == "DayNum"] <- "D_t2Num"
Add distance metrics to model_df by merging
# Attach the venue-to-home-stadium Distance for team 1, matching on season,
# day and team id. NOTE(review): 'D_t2Num' is the DayNum column after the
# earlier gsub('.y','_t2',...) rename accidentally rewrote "DayNum" (the
# unescaped '.' matched the 'a' before 'y') -- confirm before renaming.
# Also note merge() re-sorts rows by the key columns, unlike left_join().
model_df <- merge(x=model_df,y=new_master_file%>%select('Season','DayNum','TeamID','Distance'),by.x = c('Season','D_t2Num','team1id'),by.y=c('Season','DayNum','TeamID'),all.x=TRUE)
# Same merge for team 2; the second Distance column gets .x/.y suffixes
model_df <- merge(x=model_df,y=new_master_file%>%select('Season','DayNum','TeamID','Distance'),by.x = c('Season','D_t2Num','team2id'),by.y=c('Season','DayNum','TeamID'),all.x=TRUE)
# Suffix the two Distance columns created by the merges above as
# Distance_t1 / Distance_t2. Anchored patterns ('\\.x$') replace only a
# literal ".x"/".y" suffix; the original unanchored '.x'/'.y' could rewrite
# any 'x'/'y' preceded by any character (this is how DayNum had already
# become D_t2Num earlier). At this point only Distance.x/Distance.y match,
# so the result is unchanged -- the fix removes the latent hazard.
names(model_df) <- gsub('\\.x$', '_t1', names(model_df))
names(model_df) <- gsub('\\.y$', '_t2', names(model_df))
# Treat the outcome as a factor, then compare team-1 travel distance
# between games team 1 won and games it lost.
model_df$team1win <- as.factor(model_df$team1win)
ggplot(model_df, aes(x = team1win, y = Distance_t1, fill = team1win)) +
  geom_boxplot()
# Pairwise scatter-plot matrices of the stat columns, a few at a time,
# colored by win/loss (Res). Factor columns inside a range are skipped by
# ggscatmat (see the warning captured below).
ggscatmat(predictors_all,columns = 3:5,color = 'Res')
ggscatmat(predictors_all,columns = 6:8,color = 'Res')
ggscatmat(predictors_all,columns = 9:10,color = 'Res')
ggscatmat(predictors_all,columns = 11:13,color = 'Res')
ggscatmat(predictors_all,columns = 14:16,color = 'Res')
ggscatmat(predictors_all,columns = 17:19,color = 'Res')
## Warning in ggscatmat(predictors_all, columns = 17:19, color = "Res"):
## Factor variables are omitted in plot
ggscatmat(predictors_all,columns = 20:21,color = 'Res')
## Warning: Removed 32 rows containing non-finite values (stat_density).
## Warning: Removed 2 rows containing missing values (geom_text).
Correlation plot
# Correlation matrix of the numeric game statistics.
# Res (a factor) is excluded; FTP is excluded because zero free-throw
# attempts produce NaN, which would poison cor().
# NOTE(review): FGP2/FGP3 can also be NaN when FGA == FGA3 or FGA3 == 0 --
# if NA cells appear in corr, exclude those columns the same way.
corr <- round(cor(predictors_all%>%select(-c(Res,FTP))),1)
# BUG FIX: the title was "Correlogram of mtcars", copy-pasted from the
# ggcorrplot example; this data is season statistics, not mtcars.
ggcorrplot(corr, hc.order = TRUE,
type = "lower",
lab = TRUE,
lab_size = 3,
method="circle",
colors = c("tomato2", "white", "springgreen3"),
title="Correlogram of season statistics",
ggtheme=theme_bw)
Scatter plot with outcome relationship
# Count-scatter of field-goal percentage against the 0/1 Outcome with a
# linear fit, plus marginal histogram/boxplot via ggMarginal.
# NOTE(review): the two ggMarginal() results are only displayed when the
# returned value is printed (interactive/knitted session); in a plain
# script they are silently discarded. Also prefer se = FALSE over se = F.
theme_set(theme_bw())
g <- ggplot(predictors_all, aes(Outcome, FGP)) +
geom_count() +
geom_smooth(method="lm", se=F)
ggMarginal(g, type = "histogram", fill="transparent")
ggMarginal(g, type = "boxplot", fill="transparent")
g
Historic performances
# Lollipop charts of historic performance since 1985.
# Convert to data.table and key the tables used in the joins below.
teams <- data.table(teams)
seeds <- data.table(seeds)
seas_compact_res <- data.table(seas_compact_res)
tour_compact_res <- data.table(tour_compact_res)
setkey(teams, TeamID)
setkey(seeds, TeamID)
# g1: teams with the most No. 1 seeds (seed number = chars 2-3 of Seed).
# Taking rows 1:20 past the available count yields NA rows, which ggplot
# then reports as "Removed ... rows" warnings (see captured output below).
g1 <-
teams[seeds][, one_seed := as.numeric(substr(Seed, 2, 3)) == 1][, sum(one_seed), by = TeamName][order(V1, decreasing = TRUE)][1:20,] %>%
ggplot(aes(x = reorder(TeamName, V1), y = V1,label = V1)) + geom_text(color="white", size=2) +
geom_point(stat = 'identity', fill = 'darkblue',size = 4) + geom_segment(aes(y=0,x = reorder(TeamName, V1), yend = V1,xend =reorder(TeamName, V1) ),color = 'black') +
labs(x = '', y = 'No 1 seeds', title = 'No. 1 Seeds since 1985') +
coord_flip()
# g2: most regular-season wins (join on winning-team id)
setkey(seas_compact_res,WTeamID)
g2 <-
seas_compact_res[teams][, .(wins = .N), by = TeamName][order(-wins)][1:20,] %>%
ggplot(aes(x = reorder(TeamName, wins), y = wins,label = wins)) + geom_text(color="white", size=2) +
geom_point(stat = 'identity', fill = 'darkblue',size = 4) + geom_segment(aes(y=0,x = reorder(TeamName,wins), yend = wins,xend =reorder(TeamName, wins) ),color = 'black') +
labs(x = '', y = 'Wins', title = 'Regular Season Wins since 1985') +
coord_flip()
# g3: most tournament wins
setkey(tour_compact_res, WTeamID)
g3 <-
tour_compact_res[teams][, .(wins = .N), by = TeamName][order(-wins)][1:20,] %>%
ggplot(aes(x = reorder(TeamName, wins), y = wins,label = wins)) + geom_text(color="white", size=2) +
geom_point(stat = 'identity', fill = 'darkblue',size = 4) + geom_segment(aes(y=0,x = reorder(TeamName,wins), yend = wins,xend =reorder(TeamName, wins) ),color = 'black') +
labs(x = '', y = 'Wins', title = 'Tournament Wins since 1985') +
coord_flip()
# g4: championships = tournament wins on DayNum 154 (the final).
# FIX: the original indexed [0:20,] -- the 0 index is silently dropped by
# data.table, so 1:20 is the intended, equivalent form.
g4 <-
tour_compact_res[teams][DayNum == 154, .(wins = .N), by = TeamName][order(-wins)][1:20,] %>%
ggplot(aes(x = reorder(TeamName, wins), y = wins,label = wins)) + geom_text(color="white", size=2)+ geom_point(stat = 'identity', fill = 'darkblue',size = 4) +
geom_segment(aes(y=0,x = reorder(TeamName,wins), yend = wins,xend =reorder(TeamName, wins) ),color = 'black') +
labs(x = '', y = 'Championships', title = 'Tournament Championships since 1985') +
coord_flip()
# Arrange the four panels in a 2x2 grid
grid.arrange(g1, g2, g3, g4, nrow = 2)
## Warning: Removed 3 rows containing missing values (geom_text).
## Warning: Removed 3 rows containing missing values (geom_point).
## Warning: Removed 3 rows containing missing values (geom_segment).
How conferences compare with each other in terms of winning championships
## Conferences analysis: championship wins (DayNum 154 games) per conference
# NOTE(review): the j expression .(ConfAbbrev, wins = .N) while grouping
# by ConfAbbrev both duplicates the grouping column and recycles the
# scalar .N against the group's vector, which can yield one row per
# championship rather than one row per conference -- and duplicated
# Description values would then make the factor(levels = ...) call below
# unsafe. Confirm the intended output is one row per conference.
conf_analysis <- tour_compact_res[team_conferences, on = c(WTeamID = 'TeamID', 'Season'), nomatch = 0
][DayNum == 154, .(ConfAbbrev, wins = .N), by = ConfAbbrev
][conferences, on = 'ConfAbbrev', nomatch = 0]
# Flag conferences above/below 4 championship wins for the diverging colors
conf_analysis$overall_type <- ifelse(conf_analysis$wins < 4, "below", "above")
# Order rows by wins, then freeze that order into the factor levels so the
# flipped axis is sorted
conf_analysis <- conf_analysis[order(conf_analysis$wins),]
conf_analysis$`Description` <- factor(conf_analysis$`Description`, levels = conf_analysis$`Description`)
# Diverging dot plot of championship wins by conference
ggplot(conf_analysis, aes(x=`Description`, y=wins, label=wins)) +
geom_point(stat='identity', aes(col=overall_type), size=6) +
scale_color_manual(name="Champion wins",
labels = c("Above Average", "Below Average"),
values = c("above"="#00ba38", "below"="#f8766d")) +
geom_text(color="white", size=2) +
labs(title="Diverging Dot Plot",
subtitle="Champion Wins by Conferences") +
ylim(0, 12) +
coord_flip()
We use 10-fold cross-validation to select and build regularized logistic models with the glmnet function. There are two common types of regularized logistic models: lasso and ridge. A hybrid of the two is the elastic-net model, whose mixture level is controlled by the alpha parameter. The other parameter is lambda, the coefficient of the penalty added to the model's loss function. More details on glmnet can be found at: https://www.rdocumentation.org/packages/glmnet/versions/2.0-16/topics/glmnet
# glmnet cannot handle missing values, so fill every remaining NA with 0.
# NOTE(review): this zero-fills the Distance columns for games with no
# match in new_master_file -- confirm 0 is an acceptable stand-in.
model_df[is.na(model_df)] <- 0
# Classification needs a factor outcome (already converted earlier;
# re-converting here is harmless).
model_df$team1win <- as.factor(model_df$team1win)
# Stratified 80/20 train/test split on the outcome.
# NOTE(review): no set.seed() precedes this, so neither the split nor the
# earlier runif() randomization is reproducible between runs.
y <- model_df[,"team1win"]
train_set <- createDataPartition(y, p = 0.8, list = FALSE)
data_train <- model_df[train_set, ]
data_test <- model_df[-train_set, ]
# Hyperparameter grid: 7 alpha values (0 = ridge ... 1 = lasso) crossed
# with 20 lambda values, evaluated by cross-validation.
glmnet_grid <- expand.grid(alpha = c(0, .1, .2, .4, .6, .8, 1),
lambda = seq(.01, .2, length = 20))
# 10-fold cross-validation
glmnet_ctrl <- trainControl(method = "cv", number = 10)
# Fit elastic-net logistic regression over the grid with centering/scaling.
# Dropped columns: 'Season', 'D_t2Num' (the mangled DayNum), and the team
# ids carry no predictive power; 'diff_rank', 'FTM_t1', 'FTM_t2', 'Res_t2',
# 'Res_t1' are (near-)linear combinations of other predictors, i.e.
# multicollinearity that linear models should not be fed.
glmnet_fit <- train(team1win ~ ., data = data_train%>%select(-c('Season','D_t2Num','team1id','team2id','diff_rank','FTM_t1','FTM_t2','Res_t2','Res_t1')),
method = "glmnet",
preProcess = c("center", "scale"),
tuneGrid = glmnet_grid,
trControl = glmnet_ctrl,family = 'binomial')
# Print the resampling summary (captured output below)
glmnet_fit
## glmnet
##
## 860 samples
## 42 predictor
## 2 classes: '0', '1'
##
## Pre-processing: centered (42), scaled (42)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 774, 774, 774, 774, 774, 774, ...
## Resampling results across tuning parameters:
##
## alpha lambda Accuracy Kappa
## 0.0 0.01 0.7162791 0.43253366
## 0.0 0.02 0.7162791 0.43253366
## 0.0 0.03 0.7116279 0.42311315
## 0.0 0.04 0.7104651 0.42061856
## 0.0 0.05 0.7116279 0.42291142
## 0.0 0.06 0.7093023 0.41824263
## 0.0 0.07 0.7081395 0.41594729
## 0.0 0.08 0.7139535 0.42756592
## 0.0 0.09 0.7127907 0.42520518
## 0.0 0.10 0.7127907 0.42515479
## 0.0 0.11 0.7116279 0.42280922
## 0.0 0.12 0.7081395 0.41586750
## 0.0 0.13 0.7081395 0.41592501
## 0.0 0.14 0.7081395 0.41592501
## 0.0 0.15 0.7058140 0.41126570
## 0.0 0.16 0.7058140 0.41126570
## 0.0 0.17 0.7058140 0.41126570
## 0.0 0.18 0.7069767 0.41356123
## 0.0 0.19 0.7093023 0.41810141
## 0.0 0.20 0.7069767 0.41345041
## 0.1 0.01 0.7197674 0.43944483
## 0.1 0.02 0.7104651 0.42074226
## 0.1 0.03 0.7093023 0.41845717
## 0.1 0.04 0.7116279 0.42309600
## 0.1 0.05 0.7104651 0.42070022
## 0.1 0.06 0.7127907 0.42544718
## 0.1 0.07 0.7127907 0.42549770
## 0.1 0.08 0.7174419 0.43479515
## 0.1 0.09 0.7162791 0.43233411
## 0.1 0.10 0.7174419 0.43474753
## 0.1 0.11 0.7174419 0.43480280
## 0.1 0.12 0.7139535 0.42781129
## 0.1 0.13 0.7093023 0.41850393
## 0.1 0.14 0.7081395 0.41615053
## 0.1 0.15 0.7093023 0.41845078
## 0.1 0.16 0.7104651 0.42080144
## 0.1 0.17 0.7104651 0.42086171
## 0.1 0.18 0.7104651 0.42086171
## 0.1 0.19 0.7081395 0.41619517
## 0.1 0.20 0.7093023 0.41855078
## 0.2 0.01 0.7174419 0.43476642
## 0.2 0.02 0.7116279 0.42310082
## 0.2 0.03 0.7127907 0.42545998
## 0.2 0.04 0.7127907 0.42555019
## 0.2 0.05 0.7162791 0.43243179
## 0.2 0.06 0.7093023 0.41836520
## 0.2 0.07 0.7116279 0.42314413
## 0.2 0.08 0.7081395 0.41621547
## 0.2 0.09 0.7069767 0.41398054
## 0.2 0.10 0.7081395 0.41630806
## 0.2 0.11 0.7081395 0.41630820
## 0.2 0.12 0.7081395 0.41630820
## 0.2 0.13 0.7093023 0.41859618
## 0.2 0.14 0.7104651 0.42089166
## 0.2 0.15 0.7046512 0.40913068
## 0.2 0.16 0.7058140 0.41142113
## 0.2 0.17 0.7058140 0.41143621
## 0.2 0.18 0.7034884 0.40677202
## 0.2 0.19 0.7034884 0.40670404
## 0.2 0.20 0.7046512 0.40899209
## 0.4 0.01 0.7116279 0.42312587
## 0.4 0.02 0.7104651 0.42096235
## 0.4 0.03 0.7081395 0.41605441
## 0.4 0.04 0.7034884 0.40678730
## 0.4 0.05 0.7069767 0.41397283
## 0.4 0.06 0.7058140 0.41160157
## 0.4 0.07 0.7081395 0.41624817
## 0.4 0.08 0.7046512 0.40916522
## 0.4 0.09 0.7058140 0.41140300
## 0.4 0.10 0.7034884 0.40674685
## 0.4 0.11 0.7011628 0.40203488
## 0.4 0.12 0.7000000 0.39974949
## 0.4 0.13 0.7000000 0.39962385
## 0.4 0.14 0.7011628 0.40192161
## 0.4 0.15 0.7011628 0.40193869
## 0.4 0.16 0.7011628 0.40194124
## 0.4 0.17 0.7011628 0.40194124
## 0.4 0.18 0.7023256 0.40431513
## 0.4 0.19 0.7034884 0.40654282
## 0.4 0.20 0.7046512 0.40878271
## 0.6 0.01 0.7116279 0.42323765
## 0.6 0.02 0.7034884 0.40688051
## 0.6 0.03 0.7023256 0.40461504
## 0.6 0.04 0.7069767 0.41404086
## 0.6 0.05 0.7093023 0.41846299
## 0.6 0.06 0.7058140 0.41145838
## 0.6 0.07 0.6988372 0.39751711
## 0.6 0.08 0.7000000 0.39975722
## 0.6 0.09 0.6976744 0.39505288
## 0.6 0.10 0.7000000 0.39963613
## 0.6 0.11 0.7023256 0.40431513
## 0.6 0.12 0.7046512 0.40896609
## 0.6 0.13 0.7034884 0.40655223
## 0.6 0.14 0.7000000 0.39950294
## 0.6 0.15 0.7023256 0.40395497
## 0.6 0.16 0.7034884 0.40618010
## 0.6 0.17 0.7046512 0.40814743
## 0.6 0.18 0.7023256 0.40324587
## 0.6 0.19 0.6988372 0.39578035
## 0.6 0.20 0.6965116 0.39096724
## 0.8 0.01 0.7116279 0.42324274
## 0.8 0.02 0.6988372 0.39761054
## 0.8 0.03 0.7058140 0.41167516
## 0.8 0.04 0.7023256 0.40452197
## 0.8 0.05 0.6976744 0.39516907
## 0.8 0.06 0.7000000 0.39977453
## 0.8 0.07 0.6988372 0.39734327
## 0.8 0.08 0.7011628 0.40194105
## 0.8 0.09 0.7034884 0.40655223
## 0.8 0.10 0.7023256 0.40414898
## 0.8 0.11 0.7011628 0.40166683
## 0.8 0.12 0.7011628 0.40133184
## 0.8 0.13 0.7034884 0.40535926
## 0.8 0.14 0.6976744 0.39339440
## 0.8 0.15 0.6976744 0.39326988
## 0.8 0.16 0.6976744 0.39326988
## 0.8 0.17 0.7011628 0.39988645
## 0.8 0.18 0.6941860 0.38530272
## 0.8 0.19 0.6790698 0.35410130
## 0.8 0.20 0.6709302 0.33683580
## 1.0 0.01 0.7058140 0.41160732
## 1.0 0.02 0.6988372 0.39767869
## 1.0 0.03 0.7093023 0.41858603
## 1.0 0.04 0.6976744 0.39532230
## 1.0 0.05 0.6976744 0.39519082
## 1.0 0.06 0.7000000 0.39970863
## 1.0 0.07 0.6988372 0.39719159
## 1.0 0.08 0.7011628 0.40179510
## 1.0 0.09 0.7000000 0.39904645
## 1.0 0.10 0.7011628 0.40054378
## 1.0 0.11 0.6976744 0.39339186
## 1.0 0.12 0.6965116 0.39095716
## 1.0 0.13 0.6976744 0.39328181
## 1.0 0.14 0.6918605 0.38096251
## 1.0 0.15 0.6802326 0.35683738
## 1.0 0.16 0.6709302 0.33712333
## 1.0 0.17 0.6651163 0.32358741
## 1.0 0.18 0.6267442 0.24405011
## 1.0 0.19 0.5511628 0.08393029
## 1.0 0.20 0.5116279 0.00000000
##
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were alpha = 0.1 and lambda = 0.01.
Plot how the model accuracy changes with the mixing percentage(alpha parameter) and regularization parameter(lambda parameter)
# Plot cross-validated accuracy across the tuning grid.
# NOTE(review): xvar/label are plot.glmnet arguments, not plot.train
# arguments -- caret's plot method likely ignores or rejects them; confirm
# which plot was intended here.
plot(glmnet_fit, xvar = "dev", label = TRUE,scales = list(x = list(log = 2)))
We want to inspect the coefficients of the best model's predictors. In our cross-validation experiments, two hyperparameter combinations achieved the best accuracy, so we have two equivalent best models and thus two sets of coefficients, as shown below.
# Best hyperparameter combination(s) found by caret; ties are kept, which
# is why two equally accurate (alpha, lambda) pairs can appear. Referring
# to the columns directly inside filter() replaces the redundant
# glmnet_fit$results$ prefix of the original.
best_param <- glmnet_fit$results %>% filter(Accuracy == max(Accuracy))
# Refit glmnet at the winning hyperparameters to inspect coefficients,
# dropping the same non-predictive / collinear columns as in training.
# NOTE(review): if best_param has several rows, alpha receives a vector
# although glmnet expects a single alpha -- confirm which row is intended.
best_fit <- glmnet(as.matrix(data_train%>%select(-c('Season','D_t2Num','team1win','team1id','team2id','diff_rank','FTM_t1','FTM_t2','Res_t2','Res_t1'))),data_train$team1win,lambda = best_param$lambda, alpha = best_param$alpha,family="binomial")
# Coefficients ("." marks terms shrunk to exactly zero)
coef(best_fit)
## 43 x 1 sparse Matrix of class "dgCMatrix"
## s0
## (Intercept) 3.919146e+00
## t1_rank -5.120394e-02
## t1_rank_n -7.184733e-02
## t2_rank 6.027526e-02
## t2_rank_n 4.977675e-02
## Score_t1 -3.426987e-03
## FGM_t1 5.107888e-02
## FGA_t1 .
## FGM3_t1 3.070900e-02
## FGA3_t1 .
## FTA_t1 -9.996756e-02
## OR_t1 1.600694e-01
## DR_t1 .
## Ast_t1 1.042594e-02
## TO_t1 -1.181789e-01
## Stl_t1 -3.494907e-02
## Blk_t1 -9.571860e-03
## PF_t1 -1.005106e-02
## FGP_t1 6.068800e+00
## FGP2_t1 -1.139388e+00
## FGP3_t1 -3.101473e+00
## FTP_t1 2.832346e+00
## Outcome_t1 3.895055e-01
## Score_t2 -2.390381e-02
## FGM_t2 .
## FGA_t2 4.339420e-02
## FGM3_t2 .
## FGA3_t2 1.149702e-02
## FTA_t2 3.630485e-02
## OR_t2 -1.530429e-01
## DR_t2 9.942938e-03
## Ast_t2 1.508885e-01
## TO_t2 1.194181e-01
## Stl_t2 -1.139296e-01
## Blk_t2 -1.763140e-01
## PF_t2 -9.864318e-03
## FGP_t2 -6.388998e+00
## FGP2_t2 -7.457883e+00
## FGP3_t2 5.173612e+00
## FTP_t2 -2.461668e+00
## Outcome_t2 -2.181682e+00
## Distance_t1 -2.418028e-07
## Distance_t2 -2.560409e-08
Besides glmnet, cv.glmnet is another commonly used function to build logistic models automatically using cross-validation. For logistic regression, cv.glmnet has similar arguments and usage as the Gaussian case; nfolds, weights, lambda, and parallel are all available to users.
The following model uses misclassification error as the criterion for 10-fold cross-validation, and then we plot the object and show the optimal values of λ .
# cv.glmnet alternative: 10-fold CV (default nfolds) over the same
# predictor matrix, using misclassification error as the criterion.
cvfit <- cv.glmnet(as.matrix(data_train%>%select(-c('Season','D_t2Num','team1win','team1id','team2id','diff_rank','FTM_t1','FTM_t2','Res_t2','Res_t1'))),data_train$team1win,family="binomial", type.measure = "class")
# CV error curve with the optimal lambda values marked
plot(cvfit)
Make predictions on the held-out test data set and produce the predicted probability that team 1 wins.
# Class probabilities for the test set; column 2 corresponds to class '1',
# i.e. P(team 1 wins)
pred_prob <- predict(glmnet_fit, newdata = data_test, type = 'prob')
Decide on the optimal prediction probability cutoff for the model. The default cutoff prediction probability score is 0.5, or the ratio of 1's and 0's in the training data. But sometimes, tuning the probability cutoff can improve the accuracy in both the development and validation samples. The InformationValue::optimalCutoff function provides ways to find the optimal cutoff to improve the prediction of 1's, of 0's, or of both, and to reduce the misclassification error. Let's compute the optimal score that minimizes the misclassification error for the above model.
# Probability cutoff minimizing misclassification error on the test set
optCutOff <- optimalCutoff(data_test$team1win, pred_prob[2])[1] #0.4785712
A confusion matrix is a table that is often used to describe the performance of a classification model (or “classifier”) on a set of test data for which the true values are known.
# Confusion matrix at the tuned cutoff. NOTE(review): with both caret and
# InformationValue loaded, this resolves to InformationValue::confusionMatrix
# (which takes a threshold argument), masking caret's -- confirm intended.
confusionMatrix(data_test$team1win, pred_prob[2],threshold = optCutOff)
## 0 1
## 0 91 43
## 1 18 61
ROC
Receiver Operating Characteristics Curve traces the percentage of true positives accurately predicted by a given logit model as the prediction probability cutoff is lowered from 1 to 0. For a good model, as the cutoff is lowered, it should mark more of actual 1’s as positives and lesser of actual 0’s as 1’s. So for a good model, the curve should rise steeply, indicating that the TPR (Y-Axis) increases faster than the FPR (X-Axis) as the cutoff score decreases. Greater the area under the ROC curve, better the predictive ability of the model.
# ROC curve (with AUC) for the test-set predicted probabilities
plotROC(data_test$team1win, pred_prob[2])
Skipped for now: we need to know how the home venue location was generated for each team.
#distm(c(lon1, lat1), c(lon2, lat2), fun = distHaversine)